Features are computed from a digitized image of a fine needle aspirate (FNA) of a breast mass. They describe characteristics of the cell nuclei present in the image. source: https://www.kaggle.com/uciml/breast-cancer-wisconsin-data
wbcd stands for "Wisconsin Breast Cancer Data".
# Load the raw Kaggle CSV; keep strings as characters so we can recode the
# diagnosis column ourselves.  (TRUE/FALSE spelled out -- T/F are reassignable.)
wbcd <- read.csv("breast_cancer.csv", header = TRUE, stringsAsFactors = FALSE)

# Drop the trailing empty "X" column present in the Kaggle export, then the
# leading "id" column -- neither carries predictive information.
wbcd$X <- NULL
wbcd <- wbcd[, -1]

# Recode diagnosis from "B"/"M" into a readable two-level factor.
wbcd$diagnosis <- factor(ifelse(wbcd$diagnosis == "B", "Benign", "Malignant"))

str(wbcd)
## 'data.frame': 569 obs. of 31 variables:
## $ diagnosis : Factor w/ 2 levels "Benign","Malignant": 2 2 2 2 2 2 2 2 2 2 ...
## $ radius_mean : num 18 20.6 19.7 11.4 20.3 ...
## $ texture_mean : num 10.4 17.8 21.2 20.4 14.3 ...
## $ perimeter_mean : num 122.8 132.9 130 77.6 135.1 ...
## $ area_mean : num 1001 1326 1203 386 1297 ...
## $ smoothness_mean : num 0.1184 0.0847 0.1096 0.1425 0.1003 ...
## $ compactness_mean : num 0.2776 0.0786 0.1599 0.2839 0.1328 ...
## $ concavity_mean : num 0.3001 0.0869 0.1974 0.2414 0.198 ...
## $ concave.points_mean : num 0.1471 0.0702 0.1279 0.1052 0.1043 ...
## $ symmetry_mean : num 0.242 0.181 0.207 0.26 0.181 ...
## $ fractal_dimension_mean : num 0.0787 0.0567 0.06 0.0974 0.0588 ...
## $ radius_se : num 1.095 0.543 0.746 0.496 0.757 ...
## $ texture_se : num 0.905 0.734 0.787 1.156 0.781 ...
## $ perimeter_se : num 8.59 3.4 4.58 3.44 5.44 ...
## $ area_se : num 153.4 74.1 94 27.2 94.4 ...
## $ smoothness_se : num 0.0064 0.00522 0.00615 0.00911 0.01149 ...
## $ compactness_se : num 0.049 0.0131 0.0401 0.0746 0.0246 ...
## $ concavity_se : num 0.0537 0.0186 0.0383 0.0566 0.0569 ...
## $ concave.points_se : num 0.0159 0.0134 0.0206 0.0187 0.0188 ...
## $ symmetry_se : num 0.03 0.0139 0.0225 0.0596 0.0176 ...
## $ fractal_dimension_se : num 0.00619 0.00353 0.00457 0.00921 0.00511 ...
## $ radius_worst : num 25.4 25 23.6 14.9 22.5 ...
## $ texture_worst : num 17.3 23.4 25.5 26.5 16.7 ...
## $ perimeter_worst : num 184.6 158.8 152.5 98.9 152.2 ...
## $ area_worst : num 2019 1956 1709 568 1575 ...
## $ smoothness_worst : num 0.162 0.124 0.144 0.21 0.137 ...
## $ compactness_worst : num 0.666 0.187 0.424 0.866 0.205 ...
## $ concavity_worst : num 0.712 0.242 0.45 0.687 0.4 ...
## $ concave.points_worst : num 0.265 0.186 0.243 0.258 0.163 ...
## $ symmetry_worst : num 0.46 0.275 0.361 0.664 0.236 ...
## $ fractal_dimension_worst: num 0.1189 0.089 0.0876 0.173 0.0768 ...
summary(wbcd)
## diagnosis radius_mean texture_mean perimeter_mean
## Benign :357 Min. : 6.981 Min. : 9.71 Min. : 43.79
## Malignant:212 1st Qu.:11.700 1st Qu.:16.17 1st Qu.: 75.17
## Median :13.370 Median :18.84 Median : 86.24
## Mean :14.127 Mean :19.29 Mean : 91.97
## 3rd Qu.:15.780 3rd Qu.:21.80 3rd Qu.:104.10
## Max. :28.110 Max. :39.28 Max. :188.50
## area_mean smoothness_mean compactness_mean concavity_mean
## Min. : 143.5 Min. :0.05263 Min. :0.01938 Min. :0.00000
## 1st Qu.: 420.3 1st Qu.:0.08637 1st Qu.:0.06492 1st Qu.:0.02956
## Median : 551.1 Median :0.09587 Median :0.09263 Median :0.06154
## Mean : 654.9 Mean :0.09636 Mean :0.10434 Mean :0.08880
## 3rd Qu.: 782.7 3rd Qu.:0.10530 3rd Qu.:0.13040 3rd Qu.:0.13070
## Max. :2501.0 Max. :0.16340 Max. :0.34540 Max. :0.42680
## concave.points_mean symmetry_mean fractal_dimension_mean
## Min. :0.00000 Min. :0.1060 Min. :0.04996
## 1st Qu.:0.02031 1st Qu.:0.1619 1st Qu.:0.05770
## Median :0.03350 Median :0.1792 Median :0.06154
## Mean :0.04892 Mean :0.1812 Mean :0.06280
## 3rd Qu.:0.07400 3rd Qu.:0.1957 3rd Qu.:0.06612
## Max. :0.20120 Max. :0.3040 Max. :0.09744
## radius_se texture_se perimeter_se area_se
## Min. :0.1115 Min. :0.3602 Min. : 0.757 Min. : 6.802
## 1st Qu.:0.2324 1st Qu.:0.8339 1st Qu.: 1.606 1st Qu.: 17.850
## Median :0.3242 Median :1.1080 Median : 2.287 Median : 24.530
## Mean :0.4052 Mean :1.2169 Mean : 2.866 Mean : 40.337
## 3rd Qu.:0.4789 3rd Qu.:1.4740 3rd Qu.: 3.357 3rd Qu.: 45.190
## Max. :2.8730 Max. :4.8850 Max. :21.980 Max. :542.200
## smoothness_se compactness_se concavity_se
## Min. :0.001713 Min. :0.002252 Min. :0.00000
## 1st Qu.:0.005169 1st Qu.:0.013080 1st Qu.:0.01509
## Median :0.006380 Median :0.020450 Median :0.02589
## Mean :0.007041 Mean :0.025478 Mean :0.03189
## 3rd Qu.:0.008146 3rd Qu.:0.032450 3rd Qu.:0.04205
## Max. :0.031130 Max. :0.135400 Max. :0.39600
## concave.points_se symmetry_se fractal_dimension_se
## Min. :0.000000 Min. :0.007882 Min. :0.0008948
## 1st Qu.:0.007638 1st Qu.:0.015160 1st Qu.:0.0022480
## Median :0.010930 Median :0.018730 Median :0.0031870
## Mean :0.011796 Mean :0.020542 Mean :0.0037949
## 3rd Qu.:0.014710 3rd Qu.:0.023480 3rd Qu.:0.0045580
## Max. :0.052790 Max. :0.078950 Max. :0.0298400
## radius_worst texture_worst perimeter_worst area_worst
## Min. : 7.93 Min. :12.02 Min. : 50.41 Min. : 185.2
## 1st Qu.:13.01 1st Qu.:21.08 1st Qu.: 84.11 1st Qu.: 515.3
## Median :14.97 Median :25.41 Median : 97.66 Median : 686.5
## Mean :16.27 Mean :25.68 Mean :107.26 Mean : 880.6
## 3rd Qu.:18.79 3rd Qu.:29.72 3rd Qu.:125.40 3rd Qu.:1084.0
## Max. :36.04 Max. :49.54 Max. :251.20 Max. :4254.0
## smoothness_worst compactness_worst concavity_worst concave.points_worst
## Min. :0.07117 Min. :0.02729 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.11660 1st Qu.:0.14720 1st Qu.:0.1145 1st Qu.:0.06493
## Median :0.13130 Median :0.21190 Median :0.2267 Median :0.09993
## Mean :0.13237 Mean :0.25427 Mean :0.2722 Mean :0.11461
## 3rd Qu.:0.14600 3rd Qu.:0.33910 3rd Qu.:0.3829 3rd Qu.:0.16140
## Max. :0.22260 Max. :1.05800 Max. :1.2520 Max. :0.29100
## symmetry_worst fractal_dimension_worst
## Min. :0.1565 Min. :0.05504
## 1st Qu.:0.2504 1st Qu.:0.07146
## Median :0.2822 Median :0.08004
## Mean :0.2901 Mean :0.08395
## 3rd Qu.:0.3179 3rd Qu.:0.09208
## Max. :0.6638 Max. :0.20750
head(wbcd,10)
## diagnosis radius_mean texture_mean perimeter_mean area_mean
## 1 Malignant 17.99 10.38 122.80 1001.0
## 2 Malignant 20.57 17.77 132.90 1326.0
## 3 Malignant 19.69 21.25 130.00 1203.0
## 4 Malignant 11.42 20.38 77.58 386.1
## 5 Malignant 20.29 14.34 135.10 1297.0
## 6 Malignant 12.45 15.70 82.57 477.1
## 7 Malignant 18.25 19.98 119.60 1040.0
## 8 Malignant 13.71 20.83 90.20 577.9
## 9 Malignant 13.00 21.82 87.50 519.8
## 10 Malignant 12.46 24.04 83.97 475.9
## smoothness_mean compactness_mean concavity_mean concave.points_mean
## 1 0.11840 0.27760 0.30010 0.14710
## 2 0.08474 0.07864 0.08690 0.07017
## 3 0.10960 0.15990 0.19740 0.12790
## 4 0.14250 0.28390 0.24140 0.10520
## 5 0.10030 0.13280 0.19800 0.10430
## 6 0.12780 0.17000 0.15780 0.08089
## 7 0.09463 0.10900 0.11270 0.07400
## 8 0.11890 0.16450 0.09366 0.05985
## 9 0.12730 0.19320 0.18590 0.09353
## 10 0.11860 0.23960 0.22730 0.08543
## symmetry_mean fractal_dimension_mean radius_se texture_se perimeter_se
## 1 0.2419 0.07871 1.0950 0.9053 8.589
## 2 0.1812 0.05667 0.5435 0.7339 3.398
## 3 0.2069 0.05999 0.7456 0.7869 4.585
## 4 0.2597 0.09744 0.4956 1.1560 3.445
## 5 0.1809 0.05883 0.7572 0.7813 5.438
## 6 0.2087 0.07613 0.3345 0.8902 2.217
## 7 0.1794 0.05742 0.4467 0.7732 3.180
## 8 0.2196 0.07451 0.5835 1.3770 3.856
## 9 0.2350 0.07389 0.3063 1.0020 2.406
## 10 0.2030 0.08243 0.2976 1.5990 2.039
## area_se smoothness_se compactness_se concavity_se concave.points_se
## 1 153.40 0.006399 0.04904 0.05373 0.01587
## 2 74.08 0.005225 0.01308 0.01860 0.01340
## 3 94.03 0.006150 0.04006 0.03832 0.02058
## 4 27.23 0.009110 0.07458 0.05661 0.01867
## 5 94.44 0.011490 0.02461 0.05688 0.01885
## 6 27.19 0.007510 0.03345 0.03672 0.01137
## 7 53.91 0.004314 0.01382 0.02254 0.01039
## 8 50.96 0.008805 0.03029 0.02488 0.01448
## 9 24.32 0.005731 0.03502 0.03553 0.01226
## 10 23.94 0.007149 0.07217 0.07743 0.01432
## symmetry_se fractal_dimension_se radius_worst texture_worst
## 1 0.03003 0.006193 25.38 17.33
## 2 0.01389 0.003532 24.99 23.41
## 3 0.02250 0.004571 23.57 25.53
## 4 0.05963 0.009208 14.91 26.50
## 5 0.01756 0.005115 22.54 16.67
## 6 0.02165 0.005082 15.47 23.75
## 7 0.01369 0.002179 22.88 27.66
## 8 0.01486 0.005412 17.06 28.14
## 9 0.02143 0.003749 15.49 30.73
## 10 0.01789 0.010080 15.09 40.68
## perimeter_worst area_worst smoothness_worst compactness_worst
## 1 184.60 2019.0 0.1622 0.6656
## 2 158.80 1956.0 0.1238 0.1866
## 3 152.50 1709.0 0.1444 0.4245
## 4 98.87 567.7 0.2098 0.8663
## 5 152.20 1575.0 0.1374 0.2050
## 6 103.40 741.6 0.1791 0.5249
## 7 153.20 1606.0 0.1442 0.2576
## 8 110.60 897.0 0.1654 0.3682
## 9 106.20 739.3 0.1703 0.5401
## 10 97.65 711.4 0.1853 1.0580
## concavity_worst concave.points_worst symmetry_worst
## 1 0.7119 0.2654 0.4601
## 2 0.2416 0.1860 0.2750
## 3 0.4504 0.2430 0.3613
## 4 0.6869 0.2575 0.6638
## 5 0.4000 0.1625 0.2364
## 6 0.5355 0.1741 0.3985
## 7 0.3784 0.1932 0.3063
## 8 0.2678 0.1556 0.3196
## 9 0.5390 0.2060 0.4378
## 10 1.1050 0.2210 0.4366
## fractal_dimension_worst
## 1 0.11890
## 2 0.08902
## 3 0.08758
## 4 0.17300
## 5 0.07678
## 6 0.12440
## 7 0.08368
## 8 0.11510
## 9 0.10720
## 10 0.20750
There are many ways to draw a correlation plot!
For practice, I applied a different plotting function to each feature group (mean, se, worst).
# Correlation matrix of the "_mean" features with histograms on the diagonal.
library(PerformanceAnalytics)
chart.Correlation(wbcd[, c(2:11)], histogram = TRUE, col = "grey10", pch = 1,
                  main = "Cancer Mean")

# Correlation panels of the "_se" features with regression lines and
# significance stars.  (TRUE/FALSE spelled out -- T/F are reassignable.)
library(psych)
pairs.panels(wbcd[, c(12:21)], ellipses = TRUE, pch = 1, lm = TRUE, cex.cor = 1,
             smoother = FALSE, stars = TRUE, main = "Cancer SE")

# ggpairs matrix of the "_worst" features.
library(ggplot2)
library(GGally)
ggpairs(wbcd[, c(22:31)]) + theme_bw() +
  labs(title = "Cancer Worst") +
  theme(plot.title = element_text(face = 'bold', color = 'black',
                                  hjust = 0.5, size = 13))
I think viewing the plots with diagnosis included is much more informative than the combined data above [3-1].
# Pair plots again, but colored by diagnosis: visual separation between
# Benign and Malignant is what actually matters for classification.
# (ggplot2 and GGally are already loaded above, so the duplicate library()
# calls were removed.)
ggpairs(wbcd[, c(2:11, 1)], aes(color = diagnosis, alpha = 0.75),
        lower = list(continuous = "smooth")) + theme_bw() +
  labs(title = "Cancer Mean") +
  theme(plot.title = element_text(face = 'bold', color = 'black',
                                  hjust = 0.5, size = 12))
ggpairs(wbcd[, c(12:21, 1)], aes(color = diagnosis, alpha = 0.75),
        lower = list(continuous = "smooth")) + theme_bw() +
  labs(title = "Cancer SE") +
  theme(plot.title = element_text(face = 'bold', color = 'black',
                                  hjust = 0.5, size = 12))
ggpairs(wbcd[, c(22:31, 1)], aes(color = diagnosis, alpha = 0.75),
        lower = list(continuous = "smooth")) + theme_bw() +
  labs(title = "Cancer Worst") +
  theme(plot.title = element_text(face = 'bold', color = 'black',
                                  hjust = 0.5, size = 12))
With ggcorr, we can read the correlation values more directly than from the graphs above.
# Correlation heatmaps with printed coefficients, one per feature group.
# The two theme() layers of the original are merged into one; ggplot merges
# theme additions, so the result is identical.
ggcorr(wbcd[, c(2:11)], name = "corr", label = TRUE) +
  labs(title = "Cancer Mean") +
  theme(legend.position = "none",
        plot.title = element_text(face = 'bold', color = 'black',
                                  hjust = 0.5, size = 12))
ggcorr(wbcd[, c(12:21)], name = "corr", label = TRUE) +
  labs(title = "Cancer SE") +
  theme(legend.position = "none",
        plot.title = element_text(face = 'bold', color = 'black',
                                  hjust = 0.5, size = 12))
ggcorr(wbcd[, c(22:31)], name = "corr", label = TRUE) +
  labs(title = "Cancer Worst") +
  theme(legend.position = "none",
        plot.title = element_text(face = 'bold', color = 'black',
                                  hjust = 0.5, size = 12))
Too many variables can cause problems such as:
Increased computation time
Overly complex visualizations
Decreased efficiency from including variables that have no effect on the analysis
Difficult data interpretation
If you look at the ggcorr plot above [3-3], a high correlation value means there is "multicollinearity" between variables.
-> Use one principal component for model development by reducing the variables with high correlation.
PCA uses standardized data so that it can avoid data distortion caused by scale differences.
# PCA on all 30 standardized features, diagnosis column excluded.
# The original `transform(wbcd)` with no arguments is a no-op, so a plain
# copy is equivalent; `scale.` is spelled out (the original `scale =` relied
# on partial argument matching).
wbcd_pca <- wbcd
all_prcomp <- prcomp(wbcd_pca[, -1], scale. = TRUE)
summary(all_prcomp)
## Importance of components%s:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 3.6444 2.3857 1.67867 1.40735 1.28403 1.09880
## Proportion of Variance 0.4427 0.1897 0.09393 0.06602 0.05496 0.04025
## Cumulative Proportion 0.4427 0.6324 0.72636 0.79239 0.84734 0.88759
## PC7 PC8 PC9 PC10 PC11 PC12
## Standard deviation 0.82172 0.69037 0.6457 0.59219 0.5421 0.51104
## Proportion of Variance 0.02251 0.01589 0.0139 0.01169 0.0098 0.00871
## Cumulative Proportion 0.91010 0.92598 0.9399 0.95157 0.9614 0.97007
## PC13 PC14 PC15 PC16 PC17 PC18
## Standard deviation 0.49128 0.39624 0.30681 0.28260 0.24372 0.22939
## Proportion of Variance 0.00805 0.00523 0.00314 0.00266 0.00198 0.00175
## Cumulative Proportion 0.97812 0.98335 0.98649 0.98915 0.99113 0.99288
## PC19 PC20 PC21 PC22 PC23 PC24
## Standard deviation 0.22244 0.17652 0.1731 0.16565 0.15602 0.1344
## Proportion of Variance 0.00165 0.00104 0.0010 0.00091 0.00081 0.0006
## Cumulative Proportion 0.99453 0.99557 0.9966 0.99749 0.99830 0.9989
## PC25 PC26 PC27 PC28 PC29 PC30
## Standard deviation 0.12442 0.09043 0.08307 0.03987 0.02736 0.01153
## Proportion of Variance 0.00052 0.00027 0.00023 0.00005 0.00002 0.00000
## Cumulative Proportion 0.99942 0.99969 0.99992 0.99997 1.00000 1.00000
screeplot(all_prcomp, npcs=10, type="lines")
# PCA on the "_mean" feature group only (columns 2-11), standardized.
# `scale.` spelled out instead of relying on partial argument matching.
mean_prcomp <- prcomp(wbcd_pca[, c(2:11)], scale. = TRUE)
summary(mean_prcomp)
## Importance of components%s:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 2.3406 1.5870 0.93841 0.7064 0.61036 0.35234
## Proportion of Variance 0.5479 0.2519 0.08806 0.0499 0.03725 0.01241
## Cumulative Proportion 0.5479 0.7997 0.88779 0.9377 0.97495 0.98736
## PC7 PC8 PC9 PC10
## Standard deviation 0.28299 0.18679 0.10552 0.01680
## Proportion of Variance 0.00801 0.00349 0.00111 0.00003
## Cumulative Proportion 0.99537 0.99886 0.99997 1.00000
# Scree plot of the leading "mean"-group components.
screeplot(mean_prcomp, npcs=4, type="lines")
# Print standard deviations and the full rotation (loadings) matrix.
print(mean_prcomp)
## Standard deviations (1, .., p=10):
## [1] 2.34063837 1.58704555 0.93841099 0.70640600 0.61035989 0.35233755
## [7] 0.28299348 0.18678810 0.10552469 0.01680196
##
## Rotation (n x k) = (10 x 10):
## PC1 PC2 PC3 PC4
## radius_mean -0.36393793 0.313929073 -0.12442759 0.029558858
## texture_mean -0.15445113 0.147180909 0.95105659 0.008916084
## perimeter_mean -0.37604434 0.284657885 -0.11408360 0.013458069
## area_mean -0.36408585 0.304841714 -0.12337786 0.013442682
## smoothness_mean -0.23248053 -0.401962324 -0.16653247 -0.107802033
## compactness_mean -0.36444206 -0.266013147 0.05827786 -0.185700413
## concavity_mean -0.39574849 -0.104285968 0.04114649 -0.166653523
## concave.points_mean -0.41803840 -0.007183605 -0.06855383 -0.072983951
## symmetry_mean -0.21523797 -0.368300910 0.03672364 0.892998475
## fractal_dimension_mean -0.07183744 -0.571767700 0.11358395 -0.349331790
## PC5 PC6 PC7 PC8
## radius_mean -0.031067022 0.264180150 -0.04418839 0.084834062
## texture_mean -0.219922761 0.032206572 0.02055748 -0.007126797
## perimeter_mean -0.005945081 0.237819464 -0.08336923 0.089258879
## area_mean -0.019341222 0.331707454 0.26118796 0.144609749
## smoothness_mean -0.843745292 -0.062225368 0.01129197 0.170503128
## compactness_mean 0.240182967 -0.005271104 -0.80380484 0.063980134
## concavity_mean 0.312533244 -0.601467155 0.36713629 0.449573315
## concave.points_mean -0.009180198 -0.265613395 0.14131308 -0.850918762
## symmetry_mean 0.112888068 0.061957003 0.04790201 0.016455606
## fractal_dimension_mean 0.264878077 0.567918997 0.34521359 -0.065259461
## PC9 PC10
## radius_mean 0.474425305 -0.6690714888
## texture_mean 0.004212629 0.0002497826
## perimeter_mean 0.380167210 0.7404905337
## area_mean -0.747347357 -0.0323589585
## smoothness_mean 0.005847386 0.0036904058
## compactness_mean -0.218732407 -0.0527527802
## concavity_mean 0.081170670 -0.0103668020
## concave.points_mean -0.022024652 -0.0037475480
## symmetry_mean 0.009067850 0.0014669472
## fractal_dimension_mean 0.129667491 0.0070573477
# PCA on the "_se" feature group only (columns 12-21), standardized.
# `scale.` spelled out instead of relying on partial argument matching.
se_prcomp <- prcomp(wbcd_pca[, c(12:21)], scale. = TRUE)
summary(se_prcomp)
## Importance of components%s:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 2.1779 1.4406 1.1245 0.77095 0.75991 0.57939
## Proportion of Variance 0.4743 0.2075 0.1264 0.05944 0.05775 0.03357
## Cumulative Proportion 0.4743 0.6819 0.8083 0.86774 0.92548 0.95905
## PC7 PC8 PC9 PC10
## Standard deviation 0.43512 0.3962 0.20436 0.14635
## Proportion of Variance 0.01893 0.0157 0.00418 0.00214
## Cumulative Proportion 0.97798 0.9937 0.99786 1.00000
# Scree plot of the leading "se"-group components.
screeplot(se_prcomp, npcs=4, type="lines")
# Print standard deviations and the full rotation (loadings) matrix.
print(se_prcomp)
## Standard deviations (1, .., p=10):
## [1] 2.1779279 1.4405579 1.1244649 0.7709473 0.7599129 0.5793947 0.4351151
## [8] 0.3961933 0.2043629 0.1463479
##
## Rotation (n x k) = (10 x 10):
## PC1 PC2 PC3 PC4
## radius_se -0.3455917 0.44035402 0.08078489 0.04864424
## texture_se -0.1886093 -0.15339415 0.59152980 -0.26297794
## perimeter_se -0.3574809 0.42030257 0.05877767 -0.01002982
## area_se -0.3040197 0.50021113 0.02483694 0.07280027
## smoothness_se -0.2124504 -0.27095295 0.42747680 0.79615347
## compactness_se -0.3747987 -0.24262835 -0.25680860 -0.08700675
## concavity_se -0.3555528 -0.22912114 -0.33819846 -0.10133141
## concave.points_se -0.3857430 -0.08499145 -0.22956424 0.04019019
## symmetry_se -0.2363156 -0.19857350 0.43932087 -0.51576918
## fractal_dimension_se -0.3287895 -0.35250198 -0.17529054 0.06118979
## PC5 PC6 PC7 PC8
## radius_se 0.01622501 -0.088641991 0.021382456 -0.12552302
## texture_se -0.71881713 0.009450168 0.007842011 0.04858855
## perimeter_se 0.01739386 -0.039589383 -0.100936382 0.03364523
## area_se 0.02485467 -0.143033086 0.178629163 0.06572712
## smoothness_se 0.18211162 0.089995405 0.100523214 0.11114632
## compactness_se 0.01693787 -0.214913715 -0.307501612 0.75611931
## concavity_se -0.09471984 0.226154507 0.788497259 0.01702619
## concave.points_se -0.08799050 0.672610241 -0.463368226 -0.29208975
## symmetry_se 0.65540293 0.074907975 0.026405642 -0.07517928
## fractal_dimension_se -0.04898647 -0.637635341 -0.124849130 -0.54872845
## PC9 PC10
## radius_se 0.31915726 -0.742675565
## texture_se -0.05112734 -0.002855984
## perimeter_se 0.51822457 0.640508399
## area_se -0.75963699 0.130732798
## smoothness_se 0.02331983 0.024217750
## compactness_se -0.01289135 -0.119255784
## concavity_se 0.11651039 0.027004427
## concave.points_se -0.17123533 -0.012995134
## symmetry_se -0.06466883 -0.002715704
## fractal_dimension_se -0.04616613 0.073272411
# PCA on the "_worst" feature group only (columns 22-31), standardized.
# `scale.` spelled out instead of relying on partial argument matching.
worst_prcomp <- prcomp(wbcd_pca[, c(22:31)], scale. = TRUE)
summary(worst_prcomp)
## Importance of components%s:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 2.3869 1.4443 0.89597 0.73531 0.71741 0.42862
## Proportion of Variance 0.5697 0.2086 0.08028 0.05407 0.05147 0.01837
## Cumulative Proportion 0.5697 0.7783 0.85860 0.91267 0.96413 0.98251
## PC7 PC8 PC9 PC10
## Standard deviation 0.28959 0.26802 0.12343 0.06326
## Proportion of Variance 0.00839 0.00718 0.00152 0.00040
## Cumulative Proportion 0.99089 0.99808 0.99960 1.00000
# Scree plot of the leading "worst"-group components.
screeplot(worst_prcomp, npcs=4, type="lines")
# Print standard deviations and the full rotation (loadings) matrix.
print(worst_prcomp)
## Standard deviations (1, .., p=10):
## [1] 2.38688848 1.44429302 0.89597293 0.73531379 0.71740732 0.42862478
## [7] 0.28959132 0.26801978 0.12342831 0.06326496
##
## Rotation (n x k) = (10 x 10):
## PC1 PC2 PC3 PC4
## radius_worst -0.3359101 0.40313668 -0.07613333 0.07095866
## texture_worst -0.2007314 0.04257198 0.97682386 -0.00233435
## perimeter_worst -0.3481510 0.37551796 -0.08382037 0.03361042
## area_worst -0.3247392 0.41525563 -0.07902211 0.06609632
## smoothness_worst -0.2486258 -0.33786981 -0.05144303 0.31183688
## compactness_worst -0.3645682 -0.25056608 -0.03801446 -0.26982605
## concavity_worst -0.3747424 -0.13908482 -0.05855486 -0.32050054
## concave.points_worst -0.3976373 0.04168507 -0.13217642 -0.05213711
## symmetry_worst -0.2497528 -0.30860719 -0.02146397 0.77152522
## fractal_dimension_worst -0.2540829 -0.47849501 -0.03601462 -0.34456154
## PC5 PC6 PC7 PC8
## radius_worst -0.0269138039 -0.17376560 0.02580208 0.01497099
## texture_worst -0.0290270596 0.01509874 -0.02645941 -0.04311839
## perimeter_worst 0.0006772917 -0.13172429 -0.02654558 0.09221707
## area_worst -0.0692448712 -0.29437547 0.24876937 0.03172400
## smoothness_worst -0.8263639733 0.07114762 0.09077120 0.16235311
## compactness_worst 0.2021719220 -0.01079188 -0.39766075 0.71502532
## concavity_worst 0.1650942746 0.53132580 0.64845080 -0.03381691
## concave.points_worst -0.0538628284 0.39305063 -0.58217320 -0.54530352
## symmetry_worst 0.4889956064 -0.02864905 0.06597451 -0.04766924
## fractal_dimension_worst 0.0247555394 -0.65021091 0.07683158 -0.38731726
## PC9 PC10
## radius_worst -0.426117589 0.707409982
## texture_worst 0.006193392 -0.006001877
## perimeter_worst -0.459151548 -0.701598949
## area_worst 0.745255816 -0.041754195
## smoothness_worst -0.039457323 -0.006807917
## compactness_worst 0.121416060 0.070202394
## concavity_worst -0.052865741 0.009177221
## concave.points_worst 0.162096574 0.003346891
## symmetry_worst 0.006068817 -0.008600691
## fractal_dimension_worst -0.082179137 -0.020161298
library("factoextra")

# Helper: PCA biplot with individuals colored by diagnosis (points only,
# with a concentration ellipse per group).  Extracted because the same call
# was copy-pasted four times.  `groups` defaults to the global diagnosis
# factor for backward compatibility.
plot_diagnosis_biplot <- function(pca_obj, groups = wbcd$diagnosis) {
  fviz_pca_biplot(pca_obj, col.ind = groups, col = "black",
                  palette = "jco", geom = "point", repel = TRUE,
                  legend.title = "Diagnosis", addEllipses = TRUE)
}

# Top-level calls auto-print the returned ggplot objects.
plot_diagnosis_biplot(all_prcomp)
plot_diagnosis_biplot(mean_prcomp)
plot_diagnosis_biplot(se_prcomp)
plot_diagnosis_biplot(worst_prcomp)
Shuffle the wbcd data (100%) and split it into a train dataset (70%) and a test dataset (30%).
# Keep diagnosis (column 1) plus the 13 features chosen from the
# correlation / PCA study above.  (`<-` replaces `=` assignment.)
wbcd1 <- wbcd[,c(1,8,28,15,5,3,18,11,20,6,13,21,10,26)]
nrows <- NROW(wbcd1)
# set.seed(1) ## uncomment to fix the random split for reproducible results
# floor() makes the 70% sample size an explicit integer (sample() would
# otherwise truncate silently); seq_len() is safe even for zero rows.
index <- sample(seq_len(nrows), floor(0.7 * nrows)) ## shuffle and divide
#train <- wbcd1 ## alternative: train on all 569 rows (100%)
train <- wbcd1[index,] ## 398 training rows (70%)
test <- wbcd1[-index,] ## 171 test rows (30%)
# Class balance in the training partition.
prop.table(table(train$diagnosis))
##
## Benign Malignant
## 0.6306533 0.3693467
prop.table(table(test$diagnosis))
##
## Benign Malignant
## 0.619883 0.380117
Our features are: ‘texture_mean’, ‘area_mean’, ‘smoothness_mean’, ‘concavity_mean’, ‘symmetry_mean’, ‘fractal_dimension_mean’, ‘texture_se’, ‘area_se’, ‘concavity_se’, ‘symmetry_se’, ‘fractal_dimension_se’, ‘smoothness_worst’, ‘concavity_worst’ (the 13 columns selected above).
# C5.0 decision tree: fit on the 13 predictors, evaluate on the held-out set.
library(C50)
library(caret)
learn_c50 <- C5.0(x = train[, -1], y = train$diagnosis)
pre_c50 <- predict(learn_c50, newdata = test[, -1])
cm_c50 <- confusionMatrix(data = pre_c50, reference = test$diagnosis)
cm_c50
## Confusion Matrix and Statistics
##
## Reference
## Prediction Benign Malignant
## Benign 102 11
## Malignant 4 54
##
## Accuracy : 0.9123
## 95% CI : (0.8594, 0.9501)
## No Information Rate : 0.6199
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8099
## Mcnemar's Test P-Value : 0.1213
##
## Sensitivity : 0.9623
## Specificity : 0.8308
## Pos Pred Value : 0.9027
## Neg Pred Value : 0.9310
## Prevalence : 0.6199
## Detection Rate : 0.5965
## Detection Prevalence : 0.6608
## Balanced Accuracy : 0.8965
##
## 'Positive' Class : Benign
##
# Test-set accuracy of boosted C5.0 for trials = 1..100 on a fixed split.
# Returns a numeric vector with one accuracy per boosting setting.
total_accuracy_c50 <- function(train, test){
  n_trials <- 100
  accuracy <- numeric(n_trials)  # preallocate instead of growing in the loop
  for (i in seq_len(n_trials)) {
    learn_imp_c50 <- C5.0(train[,-1], train$diagnosis, trials = i)
    p_c50 <- predict(learn_imp_c50, test[,-1])
    cm <- confusionMatrix(p_c50, test$diagnosis)
    accuracy[i] <- cm$overall[1]  # overall accuracy
  }
  accuracy
}
a <- total_accuracy_c50(train,test)
# which.max() returns the index of the FIRST maximum -- identical to
# which(a == max(a))[1], but clearer and cheaper.
opt_trials <- which.max(a)
# Refit with the best number of boosting trials and evaluate.
learn_imp_c50 <- C5.0(train[,-1],train$diagnosis,trials=opt_trials)
pre_imp_c50 <- predict(learn_imp_c50, test[,-1])
cm_imp_c50 <- confusionMatrix(pre_imp_c50, test$diagnosis)
cm_imp_c50
## Confusion Matrix and Statistics
##
## Reference
## Prediction Benign Malignant
## Benign 104 7
## Malignant 2 58
##
## Accuracy : 0.9474
## 95% CI : (0.9024, 0.9757)
## No Information Rate : 0.6199
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8866
## Mcnemar's Test P-Value : 0.1824
##
## Sensitivity : 0.9811
## Specificity : 0.8923
## Pos Pred Value : 0.9369
## Neg Pred Value : 0.9667
## Prevalence : 0.6199
## Detection Rate : 0.6082
## Detection Prevalence : 0.6491
## Balanced Accuracy : 0.9367
##
## 'Positive' Class : Benign
##
# CART tree via rpart; minsplit = 2 allows the tree to grow very deep
# (it is pruned in the next step).
library(rpart)
learn_rp <- rpart(diagnosis ~ ., data = train,
                  control = rpart.control(minsplit = 2))
pre_rp <- predict(learn_rp, newdata = test[, -1], type = "class")
cm_rp <- confusionMatrix(data = pre_rp, reference = test$diagnosis)
cm_rp
## Confusion Matrix and Statistics
##
## Reference
## Prediction Benign Malignant
## Benign 104 8
## Malignant 2 57
##
## Accuracy : 0.9415
## 95% CI : (0.8951, 0.9716)
## No Information Rate : 0.6199
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8737
## Mcnemar's Test P-Value : 0.1138
##
## Sensitivity : 0.9811
## Specificity : 0.8769
## Pos Pred Value : 0.9286
## Neg Pred Value : 0.9661
## Prevalence : 0.6199
## Detection Rate : 0.6082
## Detection Prevalence : 0.6550
## Balanced Accuracy : 0.9290
##
## 'Positive' Class : Benign
##
# Prune the tree at the complexity parameter with the lowest
# cross-validated error (xerror) in the CP table.
cp_table <- learn_rp$cptable
best_cp <- cp_table[which.min(cp_table[, "xerror"]), "CP"]
learn_pru <- prune(learn_rp, cp = best_cp)
pre_pru <- predict(learn_pru, newdata = test[, -1], type = "class")
cm_pru <- confusionMatrix(data = pre_pru, reference = test$diagnosis)
cm_pru
## Confusion Matrix and Statistics
##
## Reference
## Prediction Benign Malignant
## Benign 104 8
## Malignant 2 57
##
## Accuracy : 0.9415
## 95% CI : (0.8951, 0.9716)
## No Information Rate : 0.6199
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8737
## Mcnemar's Test P-Value : 0.1138
##
## Sensitivity : 0.9811
## Specificity : 0.8769
## Pos Pred Value : 0.9286
## Neg Pred Value : 0.9661
## Prevalence : 0.6199
## Detection Rate : 0.6082
## Detection Prevalence : 0.6550
## Balanced Accuracy : 0.9290
##
## 'Positive' Class : Benign
##
# OneR: a single-attribute rule learner, useful as a simplicity baseline.
library("RWeka")
learn_1r <- OneR(diagnosis ~ ., data = train)
pre_1r <- predict(learn_1r, newdata = test[, -1])
cm_1r <- confusionMatrix(data = pre_1r, reference = test$diagnosis)
cm_1r
## Confusion Matrix and Statistics
##
## Reference
## Prediction Benign Malignant
## Benign 97 17
## Malignant 9 48
##
## Accuracy : 0.848
## 95% CI : (0.7852, 0.8982)
## No Information Rate : 0.6199
## P-Value [Acc > NIR] : 4.911e-11
##
## Kappa : 0.6695
## Mcnemar's Test P-Value : 0.1698
##
## Sensitivity : 0.9151
## Specificity : 0.7385
## Pos Pred Value : 0.8509
## Neg Pred Value : 0.8421
## Prevalence : 0.6199
## Detection Rate : 0.5673
## Detection Prevalence : 0.6667
## Balanced Accuracy : 0.8268
##
## 'Positive' Class : Benign
##
# JRip (RIPPER): propositional rule learner from RWeka.
learn_jrip <- JRip(diagnosis ~ ., data = train)
pre_jrip <- predict(learn_jrip, newdata = test[, -1])
cm_jrip <- confusionMatrix(data = pre_jrip, reference = test$diagnosis)
cm_jrip
## Confusion Matrix and Statistics
##
## Reference
## Prediction Benign Malignant
## Benign 102 7
## Malignant 4 58
##
## Accuracy : 0.9357
## 95% CI : (0.8878, 0.9675)
## No Information Rate : 0.6199
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8623
## Mcnemar's Test P-Value : 0.5465
##
## Sensitivity : 0.9623
## Specificity : 0.8923
## Pos Pred Value : 0.9358
## Neg Pred Value : 0.9355
## Prevalence : 0.6199
## Detection Rate : 0.5965
## Detection Prevalence : 0.6374
## Balanced Accuracy : 0.9273
##
## 'Positive' Class : Benign
##
# Naive Bayes classifier on the raw (unscaled) predictors.
library(e1071)
learn_nb <- naiveBayes(x = train[, -1], y = train$diagnosis)
pre_nb <- predict(learn_nb, newdata = test[, -1])
cm_nb <- confusionMatrix(data = pre_nb, reference = test$diagnosis)
cm_nb
## Confusion Matrix and Statistics
##
## Reference
## Prediction Benign Malignant
## Benign 105 10
## Malignant 1 55
##
## Accuracy : 0.9357
## 95% CI : (0.8878, 0.9675)
## No Information Rate : 0.6199
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.8597
## Mcnemar's Test P-Value : 0.01586
##
## Sensitivity : 0.9906
## Specificity : 0.8462
## Pos Pred Value : 0.9130
## Neg Pred Value : 0.9821
## Prevalence : 0.6199
## Detection Rate : 0.6140
## Detection Prevalence : 0.6725
## Balanced Accuracy : 0.9184
##
## 'Positive' Class : Benign
##
# Test-set accuracy of naive Bayes for Laplace smoothing values 1..100.
# Returns a numeric vector with one accuracy per smoothing setting.
total_accuracy_nb <- function(train, test){
  library(e1071)
  library(caret)
  n_settings <- 100
  accuracy <- numeric(n_settings)  # preallocate instead of growing in the loop
  for (i in seq_len(n_settings)) {
    learn_imp_nb <- naiveBayes(train[,-1], train$diagnosis, laplace = i)
    p_nb <- predict(learn_imp_nb, test[,-1])
    cm <- confusionMatrix(p_nb, test$diagnosis)
    accuracy[i] <- cm$overall[1]  # overall accuracy
  }
  accuracy
}
b <- total_accuracy_nb(train,test)
# Index of the first maximum accuracy -- same as which(b == max(b))[1].
opt_laplace <- which.max(b)
# Refit with the best Laplace smoothing value and evaluate.
learn_imp_nb <- naiveBayes(train[,-1], train$diagnosis, laplace=opt_laplace)
pre_imp_nb <- predict(learn_imp_nb, test[,-1])
cm_imp_nb <- confusionMatrix(pre_imp_nb, test$diagnosis)
cm_imp_nb
## Confusion Matrix and Statistics
##
## Reference
## Prediction Benign Malignant
## Benign 105 10
## Malignant 1 55
##
## Accuracy : 0.9357
## 95% CI : (0.8878, 0.9675)
## No Information Rate : 0.6199
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.8597
## Mcnemar's Test P-Value : 0.01586
##
## Sensitivity : 0.9906
## Specificity : 0.8462
## Pos Pred Value : 0.9130
## Neg Pred Value : 0.9821
## Prevalence : 0.6199
## Detection Rate : 0.6140
## Detection Prevalence : 0.6725
## Balanced Accuracy : 0.9184
##
## 'Positive' Class : Benign
##
# Random forest with 100 trees; the proximity matrix is kept (e.g. for MDS
# plots).  `proximity = TRUE` spelled out -- T is reassignable.
library(randomForest)
learn_rf <- randomForest(diagnosis ~ ., data = train, ntree = 100,
                         proximity = TRUE)
pre_rf <- predict(learn_rf, test[,-1])
cm_rf <- confusionMatrix(pre_rf, test$diagnosis)
cm_rf
## Confusion Matrix and Statistics
##
## Reference
## Prediction Benign Malignant
## Benign 104 9
## Malignant 2 56
##
## Accuracy : 0.9357
## 95% CI : (0.8878, 0.9675)
## No Information Rate : 0.6199
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.8606
## Mcnemar's Test P-Value : 0.07044
##
## Sensitivity : 0.9811
## Specificity : 0.8615
## Pos Pred Value : 0.9204
## Neg Pred Value : 0.9655
## Prevalence : 0.6199
## Detection Rate : 0.6082
## Detection Prevalence : 0.6608
## Balanced Accuracy : 0.9213
##
## 'Positive' Class : Benign
##
# Conditional inference tree, depth-limited to 2 for interpretability.
library(party)
learn_ct <- ctree(diagnosis ~ ., data = train,
                  controls = ctree_control(maxdepth = 2))
pre_ct <- predict(learn_ct, newdata = test[, -1])
cm_ct <- confusionMatrix(data = pre_ct, reference = test$diagnosis)
cm_ct
## Confusion Matrix and Statistics
##
## Reference
## Prediction Benign Malignant
## Benign 104 21
## Malignant 2 44
##
## Accuracy : 0.8655
## 95% CI : (0.805, 0.9128)
## No Information Rate : 0.6199
## P-Value [Acc > NIR] : 9.936e-13
##
## Kappa : 0.6975
## Mcnemar's Test P-Value : 0.0001746
##
## Sensitivity : 0.9811
## Specificity : 0.6769
## Pos Pred Value : 0.8320
## Neg Pred Value : 0.9565
## Prevalence : 0.6199
## Detection Rate : 0.6082
## Detection Prevalence : 0.7310
## Balanced Accuracy : 0.8290
##
## 'Positive' Class : Benign
##
# k-nearest neighbours with k = 25; winning-vote proportions are attached
# as the "prob" attribute.  `prob = TRUE` spelled out -- T is reassignable.
# NOTE(review): the features are unscaled here, so distances are dominated
# by large-magnitude columns such as area_mean -- confirm this is intended.
library(class)
pre_knn <- knn(train = train[, -1], test = test[, -1], cl = train[, 1],
               k = 25, prob = TRUE)
cm_knn <- confusionMatrix(pre_knn, test$diagnosis)
cm_knn
## Confusion Matrix and Statistics
##
## Reference
## Prediction Benign Malignant
## Benign 105 21
## Malignant 1 44
##
## Accuracy : 0.8713
## 95% CI : (0.8117, 0.9176)
## No Information Rate : 0.6199
## P-Value [Acc > NIR] : 2.461e-13
##
## Kappa : 0.7097
## Mcnemar's Test P-Value : 5.104e-05
##
## Sensitivity : 0.9906
## Specificity : 0.6769
## Pos Pred Value : 0.8333
## Neg Pred Value : 0.9778
## Prevalence : 0.6199
## Detection Rate : 0.6140
## Detection Prevalence : 0.7368
## Balanced Accuracy : 0.8337
##
## 'Positive' Class : Benign
##
library(gbm)
# NOTE(review): distribution="gaussian" with a factor response treats the
# class as numeric; this first fit is only used to pick the tree count via
# gbm.perf() below.  Confirm whether "bernoulli" on a 0/1 response was
# intended.
test_gbm <- gbm(diagnosis~., data=train, distribution="gaussian",n.trees = 10000,
shrinkage = 0.01, interaction.depth = 4, bag.fraction=0.5, train.fraction=0.5,n.minobsinnode=10,cv.folds=3,keep.data=TRUE,verbose=FALSE,n.cores=1)
# Optimal number of boosting iterations according to 3-fold CV.
best.iter <- gbm.perf(test_gbm, method="cv",plot.it=FALSE)
# Refit through caret at the chosen tree count; the other tuning values are
# fixed in a single-row tuneGrid (leading-dot column names are the legacy
# caret grid convention).
fitControl = trainControl(method="cv", number=5, returnResamp="all")
learn_gbm = train(diagnosis~., data=train, method="gbm", distribution="bernoulli", trControl=fitControl, verbose=F, tuneGrid=data.frame(.n.trees=best.iter, .shrinkage=0.01, .interaction.depth=1, .n.minobsinnode=1))
pre_gbm <- predict(learn_gbm, test[,-1])
cm_gbm <- confusionMatrix(pre_gbm, test$diagnosis)
cm_gbm
## Confusion Matrix and Statistics
##
## Reference
## Prediction Benign Malignant
## Benign 105 8
## Malignant 1 57
##
## Accuracy : 0.9474
## 95% CI : (0.9024, 0.9757)
## No Information Rate : 0.6199
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8859
## Mcnemar's Test P-Value : 0.0455
##
## Sensitivity : 0.9906
## Specificity : 0.8769
## Pos Pred Value : 0.9292
## Neg Pred Value : 0.9828
## Prevalence : 0.6199
## Detection Rate : 0.6140
## Detection Prevalence : 0.6608
## Balanced Accuracy : 0.9337
##
## 'Positive' Class : Benign
##
# Gentle AdaBoost over rpart base learners.  cp = -1 disables the complexity
# penalty so each weak learner can grow; the training data doubles as the
# monitoring set (test.x / test.y).
library(rpart)
library(ada)
control <- rpart.control(cp = -1, maxdepth = 14, maxcompete = 1, xval = 0)
learn_ada <- ada(diagnosis ~ ., data = train, test.x = train[, -1],
                 test.y = train[, 1], type = "gentle", control = control,
                 iter = 70)
pre_ada <- predict(learn_ada, newdata = test[, -1])
cm_ada <- confusionMatrix(data = pre_ada, reference = test$diagnosis)
cm_ada
## Confusion Matrix and Statistics
##
## Reference
## Prediction Benign Malignant
## Benign 104 8
## Malignant 2 57
##
## Accuracy : 0.9415
## 95% CI : (0.8951, 0.9716)
## No Information Rate : 0.6199
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8737
## Mcnemar's Test P-Value : 0.1138
##
## Sensitivity : 0.9811
## Specificity : 0.8769
## Pos Pred Value : 0.9286
## Neg Pred Value : 0.9661
## Prevalence : 0.6199
## Detection Rate : 0.6082
## Detection Prevalence : 0.6550
## Balanced Accuracy : 0.9290
##
## 'Positive' Class : Benign
##
## Baseline SVM with e1071 defaults (RBF kernel, cost = 1, gamma = 1/n_features).
learn_svm <- svm(diagnosis~., data=train)
pre_svm <- predict(learn_svm, test[,-1])
cm_svm <- confusionMatrix(pre_svm, test$diagnosis)
cm_svm
## Confusion Matrix and Statistics
##
## Reference
## Prediction Benign Malignant
## Benign 105 9
## Malignant 1 56
##
## Accuracy : 0.9415
## 95% CI : (0.8951, 0.9716)
## No Information Rate : 0.6199
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.8729
## Mcnemar's Test P-Value : 0.02686
##
## Sensitivity : 0.9906
## Specificity : 0.8615
## Pos Pred Value : 0.9211
## Neg Pred Value : 0.9825
## Prevalence : 0.6199
## Detection Rate : 0.6140
## Detection Prevalence : 0.6667
## Balanced Accuracy : 0.9261
##
## 'Positive' Class : Benign
##
## Tuning grid for the RBF SVM: 6 cost values x 21 gamma values = 126 rows.
## (The original "## 231" annotation was wrong for these sequences.)
gamma <- seq(0,0.1,0.005)
cost <- 2^(0:5)
parms <- expand.grid(cost=cost, gamma=gamma) ## 126

## Fit one SVM per (cost, gamma) row of the global `parms` grid and return the
## vector of hold-out accuracies, aligned with the rows of `parms`.
## Fixed: preallocate the result instead of growing it, and iterate with
## seq_len() rather than 1:NROW().
total_accuracy_svm <- function(train, test){
  accuracy <- numeric(NROW(parms))
  for (i in seq_len(NROW(parms))) {
    fit <- svm(diagnosis~., data=train, gamma=parms$gamma[i], cost=parms$cost[i])
    pred <- predict(fit, test[,-1])
    accuracy[i] <- confusionMatrix(pred, test$diagnosis)$overall[1]
  }
  accuracy
}
## Evaluate the whole grid and refit with the best (cost, gamma) pair.
## Renamed the accuracy vector `c` -> `accs`: `c` shadows base::c().
accs <- total_accuracy_svm(train,test)
opt_parms <- which(accs==max(accs))[1]
learn_imp_svm <- svm(diagnosis~., data=train, cost=parms$cost[opt_parms], gamma=parms$gamma[opt_parms])
pre_imp_svm <- predict(learn_imp_svm, test[,-1])
cm_imp_svm <- confusionMatrix(pre_imp_svm, test$diagnosis)
cm_imp_svm
## Confusion Matrix and Statistics
##
## Reference
## Prediction Benign Malignant
## Benign 106 4
## Malignant 0 61
##
## Accuracy : 0.9766
## 95% CI : (0.9412, 0.9936)
## No Information Rate : 0.6199
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9498
## Mcnemar's Test P-Value : 0.1336
##
## Sensitivity : 1.0000
## Specificity : 0.9385
## Pos Pred Value : 0.9636
## Neg Pred Value : 1.0000
## Prevalence : 0.6199
## Detection Rate : 0.6199
## Detection Prevalence : 0.6433
## Balanced Accuracy : 0.9692
##
## 'Positive' Class : Benign
##
## Visual comparison: one fourfold plot per model on a 3x5 grid, each titled
## with its rounded accuracy percentage.
col <- c("#ed3b3b", "#0099ff")
par(mfrow=c(3,5))
fourfoldplot(cm_c50$table, color = col, conf.level = 0, margin = 1, main=paste("C5.0 (",round(cm_c50$overall[1]*100),"%)",sep=""))
fourfoldplot(cm_imp_c50$table, color = col, conf.level = 0, margin = 1, main=paste("Improve C5.0 (",round(cm_imp_c50$overall[1]*100),"%)",sep=""))
fourfoldplot(cm_rp$table, color = col, conf.level = 0, margin = 1, main=paste("RPart (",round(cm_rp$overall[1]*100),"%)",sep=""))
fourfoldplot(cm_pru$table, color = col, conf.level = 0, margin = 1, main=paste("Prune (",round(cm_pru$overall[1]*100),"%)",sep=""))
fourfoldplot(cm_1r$table, color = col, conf.level = 0, margin = 1, main=paste("OneR (",round(cm_1r$overall[1]*100),"%)",sep=""))
fourfoldplot(cm_jrip$table, color = col, conf.level = 0, margin = 1, main=paste("JRip (",round(cm_jrip$overall[1]*100),"%)",sep=""))
fourfoldplot(cm_ct$table, color = col, conf.level = 0, margin = 1, main=paste("CTree (",round(cm_ct$overall[1]*100),"%)",sep=""))
fourfoldplot(cm_nb$table, color = col, conf.level = 0, margin = 1, main=paste("NaiveBayes (",round(cm_nb$overall[1]*100),"%)",sep=""))
fourfoldplot(cm_imp_nb$table, color = col, conf.level = 0, margin = 1, main=paste("Improve NaiveBayes\n(",round(cm_imp_nb$overall[1]*100),"%)",sep=""))
fourfoldplot(cm_knn$table, color = col, conf.level = 0, margin = 1, main=paste("KNN (",round(cm_knn$overall[1]*100),"%)",sep=""))
fourfoldplot(cm_rf$table, color = col, conf.level = 0, margin = 1, main=paste("RandomForest (",round(cm_rf$overall[1]*100),"%)",sep=""))
fourfoldplot(cm_gbm$table, color = col, conf.level = 0, margin = 1, main=paste("GBM (",round(cm_gbm$overall[1]*100),"%)",sep=""))
fourfoldplot(cm_ada$table, color = col, conf.level = 0, margin = 1, main=paste("AdaBoost (",round(cm_ada$overall[1]*100),"%)",sep=""))
fourfoldplot(cm_svm$table, color = col, conf.level = 0, margin = 1, main=paste("SVM (",round(cm_svm$overall[1]*100),"%)",sep=""))
fourfoldplot(cm_imp_svm$table, color = col, conf.level = 0, margin = 1, main=paste("Improve SVM (",round(cm_imp_svm$overall[1]*100),"%)",sep=""))
## Collect every model's accuracy and keep the best-scoring one(s).
opt_predict <- c(cm_c50$overall[1], cm_imp_c50$overall[1], cm_rp$overall[1], cm_pru$overall[1], cm_1r$overall[1], cm_jrip$overall[1], cm_ct$overall[1], cm_nb$overall[1], cm_imp_nb$overall[1], cm_knn$overall[1], cm_rf$overall[1], cm_gbm$overall[1], cm_ada$overall[1], cm_svm$overall[1], cm_imp_svm$overall[1])
names(opt_predict) <- c("c50","imp_c50","rpart","prune","1r","jrip","ctree","nb","imp_nb","knn","rf","gbm","ada","svm","imp_svm")
best_predict_model <- subset(opt_predict, opt_predict==max(opt_predict))
best_predict_model
## imp_svm
## 0.9766082
Our selected features are: ‘texture_mean’, ‘area_mean’, ‘smoothness_mean’, ‘concavity_mean’, ‘symmetry_mean’, ‘fractal_dimension_mean’, ‘area_se’, ‘smoothness_se’, ‘fractal_dimension_se’, ‘smoothness_worst’, ‘concavity_worst’, ‘symmetry_worst’.
## Reduced dataset: diagnosis (col 1) plus 15 hand-picked feature columns.
wbcd2=wbcd[,c(1,2,3,6,7,10,12,13,16,17,20,22,23,26,27,30)]
nrows <- NROW(wbcd2)
# set.seed(1) ## fix random value
index <- sample(1:nrows, 0.7 * nrows) ## shuffle and divide
#train <- wbcd ## all 569 rows (100%)
train <- wbcd2[index,] ## 398 training rows (70%)
test <- wbcd2[-index,] ## 171 test rows (30%)
library(caret)
library(C50)
## C5.0 decision tree on the reduced feature set.
learn_c50 <- C5.0(train[,-1],train$diagnosis)
pre_c50 <- predict(learn_c50, test[,-1])
cm_c50 <- confusionMatrix(pre_c50, test$diagnosis)
cm_c50
## Confusion Matrix and Statistics
##
## Reference
## Prediction Benign Malignant
## Benign 104 4
## Malignant 3 60
##
## Accuracy : 0.9591
## 95% CI : (0.9175, 0.9834)
## No Information Rate : 0.6257
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9123
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9720
## Specificity : 0.9375
## Pos Pred Value : 0.9630
## Neg Pred Value : 0.9524
## Prevalence : 0.6257
## Detection Rate : 0.6082
## Detection Prevalence : 0.6316
## Balanced Accuracy : 0.9547
##
## 'Positive' Class : Benign
##
## Sweep C5.0 boosting trials 1..100 and return the hold-out accuracy for each
## trial count (index i == trials = i).
## Fixed: preallocate the result vector instead of growing it element by
## element, and iterate with seq_len().
total_accuracy_c50 <- function(train, test){
  accuracy <- numeric(100)
  for (i in seq_len(100)) {
    fit <- C5.0(train[,-1], train$diagnosis, trials = i)
    pred <- predict(fit, test[,-1])
    accuracy[i] <- confusionMatrix(pred, test$diagnosis)$overall[1]
  }
  accuracy
}
## Pick the smallest trial count that reaches the maximum accuracy and refit.
a <- total_accuracy_c50(train,test)
opt_trials <- which(a==max(a))[1]
learn_imp_c50 <- C5.0(train[,-1],train$diagnosis,trials=opt_trials)
pre_imp_c50 <- predict(learn_imp_c50, test[,-1])
cm_imp_c50 <- confusionMatrix(pre_imp_c50, test$diagnosis)
cm_imp_c50
## Confusion Matrix and Statistics
##
## Reference
## Prediction Benign Malignant
## Benign 106 2
## Malignant 1 62
##
## Accuracy : 0.9825
## 95% CI : (0.9496, 0.9964)
## No Information Rate : 0.6257
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9624
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9907
## Specificity : 0.9688
## Pos Pred Value : 0.9815
## Neg Pred Value : 0.9841
## Prevalence : 0.6257
## Detection Rate : 0.6199
## Detection Prevalence : 0.6316
## Balanced Accuracy : 0.9797
##
## 'Positive' Class : Benign
##
library(rpart)
## CART tree; minsplit=2 lets nodes split down to two observations (deep tree,
## pruned in the next step).
learn_rp <- rpart(diagnosis~.,data=train,control=rpart.control(minsplit=2))
pre_rp <- predict(learn_rp, test[,-1], type="class")
cm_rp <- confusionMatrix(pre_rp, test$diagnosis)
cm_rp
## Confusion Matrix and Statistics
##
## Reference
## Prediction Benign Malignant
## Benign 102 5
## Malignant 5 59
##
## Accuracy : 0.9415
## 95% CI : (0.8951, 0.9716)
## No Information Rate : 0.6257
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8751
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9533
## Specificity : 0.9219
## Pos Pred Value : 0.9533
## Neg Pred Value : 0.9219
## Prevalence : 0.6257
## Detection Rate : 0.5965
## Detection Prevalence : 0.6257
## Balanced Accuracy : 0.9376
##
## 'Positive' Class : Benign
##
## Prune the CART tree at the CP value with minimum cross-validated error.
learn_pru <- prune(learn_rp, cp=learn_rp$cptable[which.min(learn_rp$cptable[,"xerror"]),"CP"])
pre_pru <- predict(learn_pru, test[,-1], type="class")
cm_pru <-confusionMatrix(pre_pru, test$diagnosis)
cm_pru
## Confusion Matrix and Statistics
##
## Reference
## Prediction Benign Malignant
## Benign 102 5
## Malignant 5 59
##
## Accuracy : 0.9415
## 95% CI : (0.8951, 0.9716)
## No Information Rate : 0.6257
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8751
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9533
## Specificity : 0.9219
## Pos Pred Value : 0.9533
## Neg Pred Value : 0.9219
## Prevalence : 0.6257
## Detection Rate : 0.5965
## Detection Prevalence : 0.6257
## Balanced Accuracy : 0.9376
##
## 'Positive' Class : Benign
##
library("RWeka")
## OneR: single-attribute rule learner, used as a simple baseline.
learn_1r <- OneR(diagnosis~., data=train)
pre_1r <- predict(learn_1r, test[,-1])
cm_1r <- confusionMatrix(pre_1r, test$diagnosis)
cm_1r
## Confusion Matrix and Statistics
##
## Reference
## Prediction Benign Malignant
## Benign 105 8
## Malignant 2 56
##
## Accuracy : 0.9415
## 95% CI : (0.8951, 0.9716)
## No Information Rate : 0.6257
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8727
## Mcnemar's Test P-Value : 0.1138
##
## Sensitivity : 0.9813
## Specificity : 0.8750
## Pos Pred Value : 0.9292
## Neg Pred Value : 0.9655
## Prevalence : 0.6257
## Detection Rate : 0.6140
## Detection Prevalence : 0.6608
## Balanced Accuracy : 0.9282
##
## 'Positive' Class : Benign
##
## RIPPER rule learner (RWeka's JRip).
learn_jrip <- JRip(diagnosis ~ ., data=train)
pre_jrip <- predict(learn_jrip, test[,-1])
cm_jrip <- confusionMatrix(pre_jrip, test$diagnosis)
cm_jrip
## Confusion Matrix and Statistics
##
## Reference
## Prediction Benign Malignant
## Benign 102 6
## Malignant 5 58
##
## Accuracy : 0.9357
## 95% CI : (0.8878, 0.9675)
## No Information Rate : 0.6257
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8622
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9533
## Specificity : 0.9062
## Pos Pred Value : 0.9444
## Neg Pred Value : 0.9206
## Prevalence : 0.6257
## Detection Rate : 0.5965
## Detection Prevalence : 0.6316
## Balanced Accuracy : 0.9298
##
## 'Positive' Class : Benign
##
library(e1071)
## Naive Bayes with default settings (Laplace smoothing = 0).
learn_nb <- naiveBayes(train[,-1], train$diagnosis)
pre_nb <- predict(learn_nb, test[,-1])
cm_nb <- confusionMatrix(pre_nb, test$diagnosis)
cm_nb
## Confusion Matrix and Statistics
##
## Reference
## Prediction Benign Malignant
## Benign 100 2
## Malignant 7 62
##
## Accuracy : 0.9474
## 95% CI : (0.9024, 0.9757)
## No Information Rate : 0.6257
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8894
## Mcnemar's Test P-Value : 0.1824
##
## Sensitivity : 0.9346
## Specificity : 0.9688
## Pos Pred Value : 0.9804
## Neg Pred Value : 0.8986
## Prevalence : 0.6257
## Detection Rate : 0.5848
## Detection Prevalence : 0.5965
## Balanced Accuracy : 0.9517
##
## 'Positive' Class : Benign
##
## Sweep Laplace smoothing 1..100 for naive Bayes and return the hold-out
## accuracy for each value.
## NOTE(review): Laplace smoothing only affects categorical predictors; with
## all-numeric features the curve is expected to be flat (the "improved" NB
## below indeed matches the baseline exactly).
## Fixed: preallocate the result vector instead of growing it, seq_len() loop.
total_accuracy_nb <- function(train, test){
  library(e1071)
  library(caret)
  accuracy <- numeric(100)
  for (i in seq_len(100)) {
    fit <- naiveBayes(train[,-1], train$diagnosis, laplace=i)
    pred <- predict(fit, test[,-1])
    accuracy[i] <- confusionMatrix(pred, test$diagnosis)$overall[1]
  }
  accuracy
}
## Pick the smallest Laplace value reaching the maximum accuracy and refit.
b <- total_accuracy_nb(train,test)
opt_laplace <- which(b==max(b))[1]
learn_imp_nb <- naiveBayes(train[,-1], train$diagnosis, laplace=opt_laplace)
pre_imp_nb <- predict(learn_imp_nb, test[,-1])
cm_imp_nb <- confusionMatrix(pre_imp_nb, test$diagnosis)
cm_imp_nb
## Confusion Matrix and Statistics
##
## Reference
## Prediction Benign Malignant
## Benign 100 2
## Malignant 7 62
##
## Accuracy : 0.9474
## 95% CI : (0.9024, 0.9757)
## No Information Rate : 0.6257
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8894
## Mcnemar's Test P-Value : 0.1824
##
## Sensitivity : 0.9346
## Specificity : 0.9688
## Pos Pred Value : 0.9804
## Neg Pred Value : 0.8986
## Prevalence : 0.6257
## Detection Rate : 0.5848
## Detection Prevalence : 0.5965
## Balanced Accuracy : 0.9517
##
## 'Positive' Class : Benign
##
library(randomForest)
## Random forest with 100 trees; proximity matrix retained (not used below).
learn_rf <- randomForest(diagnosis~., data=train, ntree=100, proximity=T)
pre_rf <- predict(learn_rf, test[,-1])
cm_rf <- confusionMatrix(pre_rf, test$diagnosis)
cm_rf
## Confusion Matrix and Statistics
##
## Reference
## Prediction Benign Malignant
## Benign 106 2
## Malignant 1 62
##
## Accuracy : 0.9825
## 95% CI : (0.9496, 0.9964)
## No Information Rate : 0.6257
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9624
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9907
## Specificity : 0.9688
## Pos Pred Value : 0.9815
## Neg Pred Value : 0.9841
## Prevalence : 0.6257
## Detection Rate : 0.6199
## Detection Prevalence : 0.6316
## Balanced Accuracy : 0.9797
##
## 'Positive' Class : Benign
##
library(party)
## Conditional inference tree, deliberately shallow (maxdepth = 2).
learn_ct <- ctree(diagnosis~., data=train, controls=ctree_control(maxdepth=2))
pre_ct <- predict(learn_ct, test[,-1])
cm_ct <- confusionMatrix(pre_ct, test$diagnosis)
cm_ct
## Confusion Matrix and Statistics
##
## Reference
## Prediction Benign Malignant
## Benign 99 3
## Malignant 8 61
##
## Accuracy : 0.9357
## 95% CI : (0.8878, 0.9675)
## No Information Rate : 0.6257
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8648
## Mcnemar's Test P-Value : 0.2278
##
## Sensitivity : 0.9252
## Specificity : 0.9531
## Pos Pred Value : 0.9706
## Neg Pred Value : 0.8841
## Prevalence : 0.6257
## Detection Rate : 0.5789
## Detection Prevalence : 0.5965
## Balanced Accuracy : 0.9392
##
## 'Positive' Class : Benign
##
library(class)
## k-NN with k = 25 on the raw (unscaled) features; labels come from column 1.
## NOTE(review): distances are dominated by large-scale features (e.g. area)
## because the data is not normalized here — confirm this is intentional.
pre_knn <- knn(train = train[,-1], test = test[,-1], cl = train[,1], k=25, prob=T)
cm_knn <- confusionMatrix(pre_knn, test$diagnosis)
cm_knn
## Confusion Matrix and Statistics
##
## Reference
## Prediction Benign Malignant
## Benign 106 5
## Malignant 1 59
##
## Accuracy : 0.9649
## 95% CI : (0.9252, 0.987)
## No Information Rate : 0.6257
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9241
## Mcnemar's Test P-Value : 0.2207
##
## Sensitivity : 0.9907
## Specificity : 0.9219
## Pos Pred Value : 0.9550
## Neg Pred Value : 0.9833
## Prevalence : 0.6257
## Detection Rate : 0.6199
## Detection Prevalence : 0.6491
## Balanced Accuracy : 0.9563
##
## 'Positive' Class : Benign
##
library(gbm)
## Same GBM recipe as the full-feature run, now on the reduced feature set.
## NOTE(review): distribution="gaussian" with a factor outcome is dubious; the
## gbm docs prescribe "bernoulli" with a 0/1 numeric response for binary
## classification — confirm this runs without warnings on this gbm version.
test_gbm <- gbm(diagnosis~., data=train, distribution="gaussian",n.trees = 10000,
shrinkage = 0.01, interaction.depth = 4, bag.fraction=0.5, train.fraction=0.5,n.minobsinnode=10,cv.folds=3,keep.data=TRUE,verbose=FALSE,n.cores=1)
## Tree count with the lowest cross-validation error.
best.iter <- gbm.perf(test_gbm, method="cv",plot.it=FALSE)
## Refit through caret (5-fold CV), pinning the grid to the CV-selected n.trees.
fitControl = trainControl(method="cv", number=5, returnResamp="all")
learn_gbm = train(diagnosis~., data=train, method="gbm", distribution="bernoulli", trControl=fitControl, verbose=F, tuneGrid=data.frame(.n.trees=best.iter, .shrinkage=0.01, .interaction.depth=1, .n.minobsinnode=1))
pre_gbm <- predict(learn_gbm, test[,-1])
cm_gbm <- confusionMatrix(pre_gbm, test$diagnosis)
cm_gbm
## Confusion Matrix and Statistics
##
## Reference
## Prediction Benign Malignant
## Benign 107 7
## Malignant 0 57
##
## Accuracy : 0.9591
## 95% CI : (0.9175, 0.9834)
## No Information Rate : 0.6257
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.9106
## Mcnemar's Test P-Value : 0.02334
##
## Sensitivity : 1.0000
## Specificity : 0.8906
## Pos Pred Value : 0.9386
## Neg Pred Value : 1.0000
## Prevalence : 0.6257
## Detection Rate : 0.6257
## Detection Prevalence : 0.6667
## Balanced Accuracy : 0.9453
##
## 'Positive' Class : Benign
##
library(rpart)
library(ada)
## Weak learners for boosting: cp = -1 disables complexity-based pruning so
## trees grow to maxdepth; xval = 0 skips internal cross-validation.
control <- rpart.control(cp = -1, maxdepth = 14,maxcompete = 1,xval = 0)
## Gentle AdaBoost, 70 iterations; training data passed as test.x/test.y only
## to track the error path. Evaluation uses the hold-out `test` set below.
learn_ada <- ada(diagnosis~., data = train, test.x = train[,-1], test.y = train[,1], type = "gentle", control = control, iter = 70)
pre_ada <- predict(learn_ada, test[,-1])
cm_ada <- confusionMatrix(pre_ada, test$diagnosis)
cm_ada
## Confusion Matrix and Statistics
##
## Reference
## Prediction Benign Malignant
## Benign 106 4
## Malignant 1 60
##
## Accuracy : 0.9708
## 95% CI : (0.9331, 0.9904)
## No Information Rate : 0.6257
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.937
## Mcnemar's Test P-Value : 0.3711
##
## Sensitivity : 0.9907
## Specificity : 0.9375
## Pos Pred Value : 0.9636
## Neg Pred Value : 0.9836
## Prevalence : 0.6257
## Detection Rate : 0.6199
## Detection Prevalence : 0.6433
## Balanced Accuracy : 0.9641
##
## 'Positive' Class : Benign
##
## Baseline SVM with e1071 defaults (RBF kernel, cost = 1, gamma = 1/n_features).
learn_svm <- svm(diagnosis~., data=train)
pre_svm <- predict(learn_svm, test[,-1])
cm_svm <- confusionMatrix(pre_svm, test$diagnosis)
cm_svm
## Confusion Matrix and Statistics
##
## Reference
## Prediction Benign Malignant
## Benign 106 1
## Malignant 1 63
##
## Accuracy : 0.9883
## 95% CI : (0.9584, 0.9986)
## No Information Rate : 0.6257
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.975
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9907
## Specificity : 0.9844
## Pos Pred Value : 0.9907
## Neg Pred Value : 0.9844
## Prevalence : 0.6257
## Detection Rate : 0.6199
## Detection Prevalence : 0.6257
## Balanced Accuracy : 0.9875
##
## 'Positive' Class : Benign
##
## Tuning grid for the RBF SVM: 6 cost values x 21 gamma values = 126 rows.
## (The original "## 231" annotation was wrong for these sequences.)
gamma <- seq(0,0.1,0.005)
cost <- 2^(0:5)
parms <- expand.grid(cost=cost, gamma=gamma) ## 126

## Fit one SVM per (cost, gamma) row of the global `parms` grid and return the
## vector of hold-out accuracies, aligned with the rows of `parms`.
## Fixed: preallocate the result instead of growing it, and iterate with
## seq_len() rather than 1:NROW().
total_accuracy_svm <- function(train, test){
  accuracy <- numeric(NROW(parms))
  for (i in seq_len(NROW(parms))) {
    fit <- svm(diagnosis~., data=train, gamma=parms$gamma[i], cost=parms$cost[i])
    pred <- predict(fit, test[,-1])
    accuracy[i] <- confusionMatrix(pred, test$diagnosis)$overall[1]
  }
  accuracy
}
## Evaluate the whole grid and refit with the best (cost, gamma) pair.
## Renamed the accuracy vector `c` -> `accs`: `c` shadows base::c().
accs <- total_accuracy_svm(train,test)
opt_parms <- which(accs==max(accs))[1]
learn_imp_svm <- svm(diagnosis~., data=train, cost=parms$cost[opt_parms], gamma=parms$gamma[opt_parms])
pre_imp_svm <- predict(learn_imp_svm, test[,-1])
cm_imp_svm <- confusionMatrix(pre_imp_svm, test$diagnosis)
cm_imp_svm
## Confusion Matrix and Statistics
##
## Reference
## Prediction Benign Malignant
## Benign 107 0
## Malignant 0 64
##
## Accuracy : 1
## 95% CI : (0.9787, 1)
## No Information Rate : 0.6257
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.6257
## Detection Rate : 0.6257
## Detection Prevalence : 0.6257
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : Benign
##
## Visual comparison on the reduced feature set: one fourfold plot per model
## on a 3x5 grid, each titled with its rounded accuracy percentage.
col <- c("#ed3b3b", "#0099ff")
par(mfrow=c(3,5))
fourfoldplot(cm_c50$table, color = col, conf.level = 0, margin = 1, main=paste("C5.0 (",round(cm_c50$overall[1]*100),"%)",sep=""))
fourfoldplot(cm_imp_c50$table, color = col, conf.level = 0, margin = 1, main=paste("Improve C5.0 (",round(cm_imp_c50$overall[1]*100),"%)",sep=""))
fourfoldplot(cm_rp$table, color = col, conf.level = 0, margin = 1, main=paste("RPart (",round(cm_rp$overall[1]*100),"%)",sep=""))
fourfoldplot(cm_pru$table, color = col, conf.level = 0, margin = 1, main=paste("Prune (",round(cm_pru$overall[1]*100),"%)",sep=""))
fourfoldplot(cm_1r$table, color = col, conf.level = 0, margin = 1, main=paste("OneR (",round(cm_1r$overall[1]*100),"%)",sep=""))
fourfoldplot(cm_jrip$table, color = col, conf.level = 0, margin = 1, main=paste("JRip (",round(cm_jrip$overall[1]*100),"%)",sep=""))
fourfoldplot(cm_ct$table, color = col, conf.level = 0, margin = 1, main=paste("CTree (",round(cm_ct$overall[1]*100),"%)",sep=""))
fourfoldplot(cm_nb$table, color = col, conf.level = 0, margin = 1, main=paste("NaiveBayes (",round(cm_nb$overall[1]*100),"%)",sep=""))
fourfoldplot(cm_imp_nb$table, color = col, conf.level = 0, margin = 1, main=paste("Improve NaiveBayes\n(",round(cm_imp_nb$overall[1]*100),"%)",sep=""))
fourfoldplot(cm_knn$table, color = col, conf.level = 0, margin = 1, main=paste("KNN (",round(cm_knn$overall[1]*100),"%)",sep=""))
fourfoldplot(cm_rf$table, color = col, conf.level = 0, margin = 1, main=paste("RandomForest (",round(cm_rf$overall[1]*100),"%)",sep=""))
fourfoldplot(cm_gbm$table, color = col, conf.level = 0, margin = 1, main=paste("GBM (",round(cm_gbm$overall[1]*100),"%)",sep=""))
fourfoldplot(cm_ada$table, color = col, conf.level = 0, margin = 1, main=paste("AdaBoost (",round(cm_ada$overall[1]*100),"%)",sep=""))
fourfoldplot(cm_svm$table, color = col, conf.level = 0, margin = 1, main=paste("SVM (",round(cm_svm$overall[1]*100),"%)",sep=""))
fourfoldplot(cm_imp_svm$table, color = col, conf.level = 0, margin = 1, main=paste("Improve SVM (",round(cm_imp_svm$overall[1]*100),"%)",sep=""))
### 4-5) Select a best prediction model according to high accuracy
opt_predict <- c(cm_c50$overall[1], cm_imp_c50$overall[1], cm_rp$overall[1], cm_pru$overall[1], cm_1r$overall[1], cm_jrip$overall[1], cm_ct$overall[1], cm_nb$overall[1], cm_imp_nb$overall[1], cm_knn$overall[1], cm_rf$overall[1], cm_gbm$overall[1], cm_ada$overall[1], cm_svm$overall[1], cm_imp_svm$overall[1])
names(opt_predict) <- c("c50","imp_c50","rpart","prune","1r","jrip","ctree","nb","imp_nb","knn","rf","gbm","ada","svm","imp_svm")
best_predict_model <- subset(opt_predict, opt_predict==max(opt_predict))
best_predict_model
## imp_svm
## 1
## Load two known patients from the raw CSV for demo predictions:
## row 19 is a malignant case, row 20 is a benign case.
patient <- read.csv("breast_cancer.csv", header=T, stringsAsFactors=F)
patient$X <- NULL
M <- patient[19,] ## 19th patient
M[,c(1,2)] ## Malignant
## id diagnosis
## 19 849014 M
B <- patient[20,] ## 20th patient
B[,c(1,2)] ## Benign
## id diagnosis
## 20 8510426 B
## Drop the true label so the demo predicts it from the features alone.
M$diagnosis <- NULL
B$diagnosis <- NULL
Use the improved SVM model as the default, since it is rated as the best prediction model.
## Predict the diagnosis for a single patient row (id in column 1, label
## already removed) and return it as a one-line report string.
## `method` defaults to the tuned SVM fitted above.
cancer_diagnosis_predict <- function(new, method=learn_imp_svm) {
  predicted <- as.character(predict(method, new[,-1]))
  paste("Patient ID: ", new[,1], " => Result: ", predicted, sep="")
}
Using the default method (improved SVM):
cancer_diagnosis_predict(B)
## [1] "Patient ID: 8510426 => Result: Benign"
Using another ML method (improved C5.0):
cancer_diagnosis_predict(B,learn_imp_c50)
## [1] "Patient ID: 8510426 => Result: Benign"
Using the default method (improved SVM):
cancer_diagnosis_predict(M)
## [1] "Patient ID: 849014 => Result: Malignant"
Using another ML method (improved C5.0):
cancer_diagnosis_predict(M,learn_imp_c50)
## [1] "Patient ID: 849014 => Result: Malignant"
## Plot every feature's distribution (Benign vs Malignant) from `data` and
## overlay the new patient's values as vertical lines, subtitled with the
## model's diagnosis for that patient.
cancer_summary <- function(new,data) {
## [a] Reshape the new dataset for ggplot
library(reshape2)
m_train <- melt(data, id="diagnosis")
m_new <- melt(new[,-1])
## [b] Variable To Highlight the key factors (geom_vline-RED)
## Fixed "dimension_worst" -> "fractal_dimension_worst": the short name never
## matched a melted variable name, so that key factor was silently skipped.
key_factors <- c("radius_mean","perimeter_mean","area_mean","perimeter_worst",
"texture_worst","radius_worst","symmetry_se","compactness_worst",
"concavity_worst","fractal_dimension_worst")
key_col <- ifelse(m_new$variable %in% key_factors,"red","black")
## [c] Save mean of Malignant value & colors
library(dplyr)
mal_mean <- subset(data, diagnosis=="Malignant", select=-1)
mal_mean <- apply(mal_mean,2,mean)
library(stringr)
## NOTE(review): assumes m_new$value and mal_mean share the same column order;
## the str_count filter limits red highlighting to non-"worst" features.
mal_col <- ifelse((round(m_new$value,3) > mal_mean) & (str_count(m_new$variable, 'worst') < 1), "red", "black")
## [d] Save titles : Main title, Patient Diagnosis
title <- "Breast Cancer Diagnosis Plot"
subtitle <- cancer_diagnosis_predict(new)
## [e] View plot highlighting your manual key factors
## (repaired: an encoding glitch had stripped the comment marker from this
## line, leaving bare prose in the function body — a syntax error)
library(ggplot2)
res_key <- ggplot(m_train, aes(x=value,color=diagnosis, fill=diagnosis))+
geom_histogram(aes(y=..density..), alpha=0.5, position="identity", bins=50)+
geom_density(alpha=.2)+
scale_color_manual(values=c("#15c3c9","#f87b72"))+
scale_fill_manual(values=c("#61d4d6","#f5a7a1"))+
geom_vline(data=m_new, aes(xintercept=value),
color=key_col, size=1.5)+
geom_label(data=m_new, aes(x=Inf, y=Inf, label=round(value,3)), nudge_y=2,
vjust = "top", hjust = "right", fill="white", color="black")+
labs(title=paste(title,"(highlight Key Factors)"), subtitle=subtitle)+
theme(plot.title = element_text(face='bold', colour='black', hjust=0.5, size=15))+
theme(plot.subtitle=element_text(lineheight=0.8, hjust=0.5))+
labs(caption="[Training 569 wisc cancer diagnostic patient data]")+
facet_wrap(~variable, scales="free", ncol=5)
## [f] View plots highlighting values above average of malignant patient
## (repaired: same encoding glitch as [e])
res_mean <- ggplot(m_train, aes(x=value,color=diagnosis, fill=diagnosis))+
geom_histogram(aes(y=..density..), alpha=0.5, position="identity", bins=50)+
geom_density(alpha=.2)+
scale_color_manual(values=c("#15c3c9","#f87b72"))+
scale_fill_manual(values=c("#61d4d6","#f5a7a1"))+
geom_vline(data=m_new, aes(xintercept=value),
color=mal_col, size=1.5)+
geom_label(data=m_new, aes(x=Inf, y=Inf, label=round(value,3)), nudge_y=2,
vjust = "top", hjust = "right", fill="white", color="black")+
labs(title=paste(title,"(highlight Above malignant average)"), subtitle=subtitle)+
theme(plot.title = element_text(face='bold', colour='black', hjust=0.5, size=15))+
theme(plot.subtitle=element_text(lineheight=0.8, hjust=0.5, size=12))+
labs(caption="[Training 569 wisc cancer diagnostic patient data]")+
facet_wrap(~variable, scales="free", ncol=5)
## [g] output graph
res_mean
#res_key
}
## Demo: full diagnostic plots for the benign and malignant example patients.
cancer_summary(B, wbcd)
cancer_summary(M, wbcd)
## Radar chart comparing a new patient's normalized feature values against the
## average benign ("Normal") profile, faceted by feature type (mean/se/worst).
cancer_radar <- function(new,data) {
## [a] Radar Function
coord_radar <- function (theta = "x", start = 0, direction = 1)
{
theta <- match.arg(theta, c("x", "y"))
r <- ifelse(theta == "x", "y", "x")
ggproto("CoordRadar", CoordPolar, theta = theta, r = r, start = start,
direction = sign(direction),
is_linear = function(coord) TRUE)
}
## [b] Normalize Function -> you can use rescale instead.
normalize <- function(x) {
return((x-min(x))/(max(x)-min(x)))
}
## [c] Get average from Normal(Benign) Data to set standards (Grey area)
b1 <- subset(data, diagnosis=="Benign", select=-1)
b2 <- as.data.frame(lapply(b1,normalize))
be <- colMeans(b2)
## [d] Normalize Patient Data to compare with normal dataset
## (uses the benign min/max so both profiles share one scale; values can
## exceed 1 when the patient lies outside the benign range)
p_new <- (new[,-1]-apply(b1,2,min))/(apply(b1,2,max)-apply(b1,2,min))
max_value <- max(p_new)
## [e] Combine Two data (Normal, Patient)
cc_radar <- rbind(be,p_new)
cc_radar <- cbind(group=c("Normal","Patient"),cc_radar)
## Fixed: load reshape2 explicitly — melt() was previously available only if
## cancer_summary() had already been called in this session.
library(reshape2)
coc <- melt(cc_radar, id="group")
library(stringr)
coc$variable <- as.character(coc$variable)
## Names with two underscores (e.g. fractal_dimension_mean) get their first
## "_" turned into "." so each name splits cleanly into feature + type.
coc$variable[str_count(coc$variable,'\\_')>1] <- sub('_', '.', coc$variable[str_count(coc$variable,'\\_')>1])
name <- unlist(strsplit(as.character(coc$variable),"_"))
coc$feature <- name[c(seq(1,length(name),2))]
coc$type <- name[c(seq(2,length(name),2))]
coc$variable <- NULL
df <- coc[order(coc$feature),]
## [f] Save titles : Main title, Patient Diagnosis
title <- "Breast Cancer Diagnosis Radar"
subtitle <- cancer_diagnosis_predict(new)
## [g] Radar plot
## (repaired: an encoding glitch had stripped the comment marker from this
## line, leaving bare prose in the function body — a syntax error)
res <- ggplot(df, aes(x=feature,y=value,group=group,fill=group,color=group))+
geom_point()+geom_polygon(alpha=0.3)+coord_radar()+ylim(0,max_value)+
scale_color_manual(values=c(NA,"#b10000"))+
scale_fill_manual(values=c("#8e8e8e",NA))+
facet_wrap(~type)+
theme(panel.background=element_rect(fill = "white", colour= NA),
panel.border=element_rect(fill = NA, colour="grey50"),
panel.grid.major=element_line(colour = "grey90", size = 0.2),
panel.grid.minor=element_line(colour = "grey98", size = 0.5),
legend.position="bottom",
strip.background = element_rect(fill = "grey80", colour = "grey50"),
axis.text.y=element_text(colour=NA),
axis.title.y=element_text(colour=NA),
axis.ticks=element_line(colour = NA))+
xlab("")+ylab("")+
labs(title=title, subtitle=subtitle)+
theme(plot.title = element_text(face='bold', colour='black', hjust=0.5, size=15))+
theme(plot.subtitle=element_text(lineheight=0.8, hjust=0.5, size=12))+
labs(caption="[Training 569 wisc cancer diagnostic patient data]")
## [h] output graph
res
}
## Demo: radar comparison for the benign and malignant example patients.
cancer_radar(B,wbcd)
cancer_radar(M,wbcd)